Set/check knitR option and working directory

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(harrietr)
## Registered S3 method overwritten by 'treeio':
##   method     from
##   root.phylo ape
library(here)
## here() starts at /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github
library(ggtree)
## ggtree v2.0.1  For help: https://yulab-smu.github.io/treedata-book/
## 
## If you use ggtree in published research, please cite the most appropriate paper(s):
## 
## - Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## - Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
## 
## Attaching package: 'ggtree'
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## The following object is masked from 'package:magrittr':
## 
##     inset
## The following object is masked from 'package:tidyr':
## 
##     expand
library(phytools)
## Loading required package: ape
## 
## Attaching package: 'ape'
## The following object is masked from 'package:ggtree':
## 
##     rotate
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## Loading required package: maps
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github"
rm(list = ls())

Import raw data

Snippy output

snippy_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_pair_id_snps.mask.tab") %>%
  arrange(PAIR_ID)
## Parsed with column specification:
## cols(
##   PAIR_ID = col_character(),
##   REFERENCE = col_character(),
##   ISOLATE = col_character(),
##   CHROM = col_character(),
##   POS = col_double(),
##   TYPE = col_character(),
##   REF = col_character(),
##   ALT = col_character(),
##   EVIDENCE = col_character(),
##   FTYPE = col_character(),
##   STRAND = col_character(),
##   NT_POS = col_character(),
##   AA_POS = col_character(),
##   EFFECT = col_character(),
##   LOCUS_TAG = col_character(),
##   GENE = col_character(),
##   PRODUCT = col_character()
## )
snippy_data 
## # A tibble: 142,030 x 17
##    PAIR_ID REFERENCE ISOLATE CHROM   POS TYPE  REF   ALT   EVIDENCE FTYPE STRAND
##    <chr>   <chr>     <chr>   <chr> <dbl> <chr> <chr> <chr> <chr>    <chr> <chr> 
##  1 GP-0002 BPH2705.… BPH2704 cont… 32669 snp   T     A     A:23 T:0 <NA>  <NA>  
##  2 GP-0002 BPH2705.… BPH2704 cont…    14 snp   G     T     T:29 G:0 <NA>  <NA>  
##  3 GP-0002 BPH2705.… BPH2704 cont…   521 snp   C     A     A:23 C:0 <NA>  <NA>  
##  4 GP-0002 BPH2705.… BPH2704 cont…   389 snp   T     A     A:18 T:0 <NA>  <NA>  
##  5 GP-0002 BPH2705.… BPH2704 cont… 61262 snp   A     C     C:25 A:0 CDS   +     
##  6 GP-0002 BPH2705.… BPH2704 cont… 59956 snp   C     A     A:21 C:0 CDS   +     
##  7 GP-0002 BPH2705.… BPH2704 cont… 38034 snp   C     A     A:41 C:0 CDS   +     
##  8 GP-0002 BPH2705.… BPH2704 cont… 37962 snp   A     C     C:38 A:0 CDS   +     
##  9 GP-0002 BPH2705.… BPH2704 cont…  2728 snp   A     T     T:18 A:0 <NA>  <NA>  
## 10 GP-0002 BPH2705.… BPH2704 cont…  2695 snp   C     A     A:20 C:0 <NA>  <NA>  
## # … with 142,020 more rows, and 6 more variables: NT_POS <chr>, AA_POS <chr>,
## #   EFFECT <chr>, LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>
# modify snippy output: 
# 1) generate iso1 and iso2 for merging with the phenotypic analysis
# 2) modify CHROM (contig name) to be consistent with the labelling in bed files down the track
# 3) extract mutation effects for easier interpretation


source("../Functions/aa_convert.R")
snippy_data_modified <- snippy_data %>%
  mutate(iso1 = str_remove(REFERENCE, ".gbk"),
         iso2 = ISOLATE,
         CHROM = str_c(iso1, "_", CHROM)) %>%
  separate(EFFECT, 
           into = c("EFFTYPE", "NUCLEOTIDE_CHANGE", "MUTATION"), 
           sep = "\\s", 
           remove = T, 
           extra = "merge") %>%
  mutate(NUCLEOTIDE_CHANGE = str_remove(NUCLEOTIDE_CHANGE, "c."),
         MUTATION = str_remove(MUTATION, "p."),
         MUTATION_SHORT = aa_convert(MUTATION)) %>%
  separate(AA_POS, 
           into = c("AA_POS", "AA_LENGTH"), 
           sep = "/", 
           remove = T) %>%
  mutate_at(vars(starts_with("AA")), 
            as.numeric)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 488 rows [458,
## 459, 554, 555, 940, 941, 942, 943, 944, 994, 995, 996, 997, 998, 1048, 1049,
## 1050, 1051, 1052, 5899, ...].
## Parsed with column specification:
## cols(
##   `Amino acid` = col_character(),
##   `Three letter code` = col_character(),
##   `One letter code` = col_character()
## )
snippy_data_modified
## # A tibble: 142,030 x 23
##    PAIR_ID REFERENCE ISOLATE CHROM   POS TYPE  REF   ALT   EVIDENCE FTYPE STRAND
##    <chr>   <chr>     <chr>   <chr> <dbl> <chr> <chr> <chr> <chr>    <chr> <chr> 
##  1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp   T     A     A:23 T:0 <NA>  <NA>  
##  2 GP-0002 BPH2705.… BPH2704 BPH2…    14 snp   G     T     T:29 G:0 <NA>  <NA>  
##  3 GP-0002 BPH2705.… BPH2704 BPH2…   521 snp   C     A     A:23 C:0 <NA>  <NA>  
##  4 GP-0002 BPH2705.… BPH2704 BPH2…   389 snp   T     A     A:18 T:0 <NA>  <NA>  
##  5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp   A     C     C:25 A:0 CDS   +     
##  6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp   C     A     A:21 C:0 CDS   +     
##  7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp   C     A     A:41 C:0 CDS   +     
##  8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp   A     C     C:38 A:0 CDS   +     
##  9 GP-0002 BPH2705.… BPH2704 BPH2…  2728 snp   A     T     T:18 A:0 <NA>  <NA>  
## 10 GP-0002 BPH2705.… BPH2704 BPH2…  2695 snp   C     A     A:20 C:0 <NA>  <NA>  
## # … with 142,020 more rows, and 12 more variables: NT_POS <chr>, AA_POS <dbl>,
## #   AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## #   LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## #   MUTATION_SHORT <chr>
snippy_data_modified %>%
  .$ISOLATE %>%
  n_distinct()
## [1] 253
df_n_mutations <- snippy_data_modified %>%
  count(PAIR_ID, sort = T)
df_n_mutations %>%
  ggplot(aes(x = n)) +
  # geom_histogram() +
  geom_density() +
  theme_bw()

distant_pairs <- df_n_mutations %>%
  filter(n > 100) %>%
  .$PAIR_ID
snippy_data_modified_no_distant_pairs <- snippy_data_modified %>%
  filter(!PAIR_ID %in% distant_pairs)
rm(distant_pairs, df_n_mutations)

clustering of protein genes (cd-hit)

protein_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.cd-hit.tab") %>%
  rename_all(funs(str_c(., "_prot"))) 
## Parsed with column specification:
## cols(
##   id = col_character(),
##   clstr = col_double(),
##   clstr_size = col_double(),
##   length = col_double(),
##   clstr_rep = col_double(),
##   clstr_iden = col_character(),
##   clstr_cov = col_character()
## )
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
nebraska_clusters <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/representative_proteins_FPR3757_cd-hit.tab") %>%
  mutate(source = if_else(str_detect(id, "SAUSA300"), "FPR3757", "mutated_proteins")) %>%
  group_by(clstr) %>%
  filter(any(str_detect(id, "BPH"))) %>%
  group_by(clstr, source) %>%
  arrange(desc(clstr_cov, clstr_id)) %>%
  filter(!(source == "FPR3757" & row_number() > 1)) %>%
  mutate(source_long = str_c(source, "_", row_number())) %>%
  ungroup() %>%
  select(id, clstr, source_long) %>%
  pivot_wider(names_from = source_long, values_from = id) %>%
  pivot_longer(cols = starts_with("mutated_proteins"), names_to = "source_long", values_to = "id_prot") %>%
  drop_na(id_prot) %>%
  select(id_prot, nebraska_locus_tag = FPR3757_1) 
## Parsed with column specification:
## cols(
##   id = col_character(),
##   clstr = col_double(),
##   clstr_size = col_double(),
##   length = col_double(),
##   clstr_rep = col_double(),
##   clstr_iden = col_character(),
##   clstr_cov = col_character()
## )
df_neb <- read_csv("Ideas_Grant_2020_analysis/Raw_data/nebraska_all_proteins.csv") %>%
  rename(neb_mutant_id = `Strain Name`, neb_gene = `Gene name`, neb_product = `gene discription`)
## Parsed with column specification:
## cols(
##   `Strain Name` = col_character(),
##   plate384 = col_double(),
##   Well384 = col_character(),
##   `Gene name` = col_character(),
##   `gene discription` = col_character(),
##   nebraska_locus_tag = col_character(),
##   Row384 = col_character(),
##   Column384 = col_double(),
##   nebraska_aa_seq = col_character()
## )
protein_clusters_data <- protein_clusters_data %>%
  left_join(nebraska_clusters) %>%
  group_by(clstr_prot) %>%
  mutate(nebraska_locus_tag = nebraska_locus_tag[which(clstr_rep_prot == 1)]) %>%
  left_join(df_neb)
## Joining, by = "id_prot"
## Joining, by = "nebraska_locus_tag"
protein_seq_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.tab") %>%
  rename_all(funs(str_c(., "_prot")))
## Parsed with column specification:
## cols(
##   FASTA_ID = col_character(),
##   SEQUENCE = col_character()
## )

merge clusters and sequences with snippy data

snippy_data_modified_proteins <- snippy_data_modified_no_distant_pairs %>%
  left_join(protein_clusters_data,
            by = c("LOCUS_TAG" = "id_prot")) %>%
  left_join(protein_seq_data,
            by = c("LOCUS_TAG" = "FASTA_ID_prot"))
snippy_data_modified_proteins
## # A tibble: 104,035 x 39
##    PAIR_ID REFERENCE ISOLATE CHROM   POS TYPE  REF   ALT   EVIDENCE FTYPE STRAND
##    <chr>   <chr>     <chr>   <chr> <dbl> <chr> <chr> <chr> <chr>    <chr> <chr> 
##  1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp   T     A     A:23 T:0 <NA>  <NA>  
##  2 GP-0002 BPH2705.… BPH2704 BPH2…    14 snp   G     T     T:29 G:0 <NA>  <NA>  
##  3 GP-0002 BPH2705.… BPH2704 BPH2…   521 snp   C     A     A:23 C:0 <NA>  <NA>  
##  4 GP-0002 BPH2705.… BPH2704 BPH2…   389 snp   T     A     A:18 T:0 <NA>  <NA>  
##  5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp   A     C     C:25 A:0 CDS   +     
##  6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp   C     A     A:21 C:0 CDS   +     
##  7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp   C     A     A:41 C:0 CDS   +     
##  8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp   A     C     C:38 A:0 CDS   +     
##  9 GP-0002 BPH2705.… BPH2704 BPH2…  2728 snp   A     T     T:18 A:0 <NA>  <NA>  
## 10 GP-0002 BPH2705.… BPH2704 BPH2…  2695 snp   C     A     A:20 C:0 <NA>  <NA>  
## # … with 104,025 more rows, and 28 more variables: NT_POS <chr>, AA_POS <dbl>,
## #   AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## #   LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## #   MUTATION_SHORT <chr>, clstr_prot <dbl>, clstr_size_prot <dbl>,
## #   length_prot <dbl>, clstr_rep_prot <dbl>, clstr_iden_prot <chr>,
## #   clstr_cov_prot <chr>, nebraska_locus_tag <chr>, neb_mutant_id <chr>,
## #   plate384 <dbl>, Well384 <chr>, neb_gene <chr>, neb_product <chr>,
## #   Row384 <chr>, Column384 <dbl>, nebraska_aa_seq <chr>, SEQUENCE_prot <chr>

clustering and annotation of intergenic regions

gff with annotation of intergenic regions

col_names <- c("CHROM_intergenic", 
               "START", 
               "END", 
               "CHROM_protein", 
               "SOURCE", 
               "TYPE", 
               "START_flank_prot", 
               "END_flank_prot", 
               "SCORE", 
               "STRAND_flank_prot", 
               "PHASE", 
               "ATTRIBUTES_flank_prot",  
               "CHROM_mutation",
               "POS_minus1",
               "POS", 
               "PAIR_ID", 
               "REFERENCE", 
               "ISOLATE")

intergenic_regions_annotated <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.annotated.bed",
                                         col_names = col_names) %>%
  select(CHROM = CHROM_intergenic, 
         START, 
         END, 
         START_flank_prot, 
         END_flank_prot, 
         STRAND_flank_prot, 
         ATTRIBUTES_flank_prot, 
         POS, 
         PAIR_ID, 
         REFERENCE, 
         ISOLATE) 
## Parsed with column specification:
## cols(
##   CHROM_intergenic = col_character(),
##   START = col_double(),
##   END = col_double(),
##   CHROM_protein = col_character(),
##   SOURCE = col_character(),
##   TYPE = col_character(),
##   START_flank_prot = col_double(),
##   END_flank_prot = col_double(),
##   SCORE = col_character(),
##   STRAND_flank_prot = col_character(),
##   PHASE = col_double(),
##   ATTRIBUTES_flank_prot = col_character(),
##   CHROM_mutation = col_character(),
##   POS_minus1 = col_double(),
##   POS = col_double(),
##   PAIR_ID = col_character(),
##   REFERENCE = col_character(),
##   ISOLATE = col_character()
## )
upstream_proteins <- intergenic_regions_annotated %>%
  group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
  filter(END_flank_prot < POS) %>%
  rename_at(vars(ends_with("_flank_prot")),
            funs(str_c(., "_upstream")))
downstream_proteins <- intergenic_regions_annotated %>%
  group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
  filter(START_flank_prot > POS) %>%
  rename_at(vars(ends_with("_flank_prot")),
            funs(str_c(., "_downstream")))
intergenic_regions_annotated <- upstream_proteins %>%
  left_join(downstream_proteins)
## Joining, by = c("CHROM", "START", "END", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")
intergenic_regions_annotated
## # A tibble: 42,815 x 15
## # Groups:   CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE [42,815]
##    CHROM  START    END START_flank_pro… END_flank_prot_… STRAND_flank_pr…
##    <chr>  <dbl>  <dbl>            <dbl>            <dbl> <chr>           
##  1 BPH2… 213895 214187           212549           213895 +               
##  2 BPH2… 162407 162758           162189           162407 -               
##  3 BPH2… 162407 162758           162189           162407 -               
##  4 BPH2… 162407 162758           162189           162407 -               
##  5 BPH2… 174634 175871           174428           174634 +               
##  6 BPH2… 174634 175871           174428           174634 +               
##  7 BPH2… 178465 178709           178217           178465 +               
##  8 BPH2… 181179 181441           180931           181179 +               
##  9 BPH2… 231689 232169           230541           231689 +               
## 10 BPH2…  98412  98874            97246            98412 -               
## # … with 42,805 more rows, and 9 more variables:
## #   ATTRIBUTES_flank_prot_upstream <chr>, POS <dbl>, PAIR_ID <chr>,
## #   REFERENCE <chr>, ISOLATE <chr>, START_flank_prot_downstream <dbl>,
## #   END_flank_prot_downstream <dbl>, STRAND_flank_prot_downstream <chr>,
## #   ATTRIBUTES_flank_prot_downstream <chr>
rm(upstream_proteins, downstream_proteins)

cd-hit clustering of intergenic regions

intergenic_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.cd-hit.tab") %>%
  separate(id, into = c("CHROM", "START", "END"), sep = "[:-]", remove = F) %>%
  rename_at(vars(id, starts_with("clstr")),
            funs(str_c(., "_intergenic"))) %>%
  mutate_at(vars(START, END), as.numeric) %>%
  distinct()
## Parsed with column specification:
## cols(
##   id = col_character(),
##   clstr = col_double(),
##   clstr_size = col_double(),
##   length = col_double(),
##   clstr_rep = col_double(),
##   clstr_iden = col_character(),
##   clstr_cov = col_character()
## )
snippy_data_modified_proteins_intergenic_regions <- snippy_data_modified_proteins %>%
  left_join(intergenic_regions_annotated,
            by = c("CHROM", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")) %>%
  left_join(intergenic_clusters_data)
## Joining, by = c("CHROM", "START", "END")

Mortality switches and phenotypes

df_phenotypes <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/genetic_pairs_pheno_changes_mortality_switches.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso1 = col_character(),
##   iso1_ST = col_character(),
##   iso1_strain_group = col_character(),
##   iso1_mortality = col_character(),
##   iso1_included = col_logical(),
##   iso1_scv = col_logical(),
##   iso1_persister_type = col_character(),
##   iso1_sample_type = col_character(),
##   iso1_genes = col_character(),
##   iso1_CC = col_character(),
##   iso2_ST = col_character(),
##   iso2_strain_group = col_character(),
##   iso2_mortality = col_character(),
##   iso2_included = col_logical(),
##   iso2_scv = col_logical(),
##   iso2_persister_type = col_character(),
##   iso2_sample_type = col_character(),
##   iso2_genes = col_character(),
##   iso2_CC = col_character()
##   # ... with 2 more columns
## )
## See spec(...) for full column specifications.
# check available phenotypes
# mortality
df_phenotypes %>%
  select(iso1, iso2, switches) %>%
  distinct() %>%
  count(switches)
## # A tibble: 5 x 2
##   switches              n
##   <chr>             <int>
## 1 Died-Died            42
## 2 Died-Survived        20
## 3 Survived-Died        20
## 4 Survived-Survived   146
## 5 <NA>                  6
# most mortality phenotypes are not available. Need to recalculate that
df_all_genetic_pairs_pheno <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/df_all_genetic_pairs_pheno.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   iso2 = col_character(),
##   iso1 = col_character(),
##   iso1_ST = col_character(),
##   iso1_strain_group = col_character(),
##   iso1_mortality = col_character(),
##   iso1_included = col_logical(),
##   iso1_scv = col_logical(),
##   iso1_persister_type = col_character(),
##   iso1_sample_type = col_character(),
##   iso1_genes = col_character(),
##   iso1_CC = col_character(),
##   iso2_ST = col_character(),
##   iso2_strain_group = col_character(),
##   iso2_mortality = col_character(),
##   iso2_included = col_logical(),
##   iso2_scv = col_logical(),
##   iso2_persister_type = col_character(),
##   iso2_sample_type = col_character(),
##   iso2_genes = col_character(),
##   iso2_CC = col_character()
##   # ... with 1 more columns
## )
## See spec(...) for full column specifications.
df_all_genetic_pairs_pheno %>%
  select(iso1, iso2, iso1_mortality, iso2_mortality) %>%
  filter(is.na(iso1_mortality) | is.na(iso2_mortality))
## # A tibble: 90 x 4
##    iso1    iso2    iso1_mortality iso2_mortality
##    <chr>   <chr>   <chr>          <chr>         
##  1 BPH2751 BPH2750 Died           <NA>          
##  2 BPH2750 BPH2751 <NA>           Died          
##  3 BPH2888 BPH2887 Died           <NA>          
##  4 BPH2887 BPH2888 <NA>           Died          
##  5 BPH2898 BPH2897 Died           <NA>          
##  6 BPH2897 BPH2898 <NA>           Died          
##  7 BPH2966 BPH2965 Died           <NA>          
##  8 BPH2965 BPH2966 <NA>           Died          
##  9 BPH2988 BPH2987 Died           <NA>          
## 10 BPH2987 BPH2988 <NA>           Died          
## # … with 80 more rows
df_all_genetic_pairs_pheno_switches <- df_all_genetic_pairs_pheno %>%
  mutate(switches = str_c(iso1_mortality, "-", iso2_mortality))
 
# count
df_all_genetic_pairs_pheno_switches %>%
  select(iso1, iso2, switches) %>%
  distinct() %>%
  count(switches)
## # A tibble: 5 x 2
##   switches              n
##   <chr>             <int>
## 1 Died-Died            68
## 2 Died-Survived       316
## 3 Survived-Died       316
## 4 Survived-Survived  1694
## 5 <NA>                 90
# df_mutations_phenotypes <- snippy_data_modified_proteins_intergenic_regions %>%
#  left_join(df_phenotypes)

Analyse mortality switches

Generate dataset of clusters of mortality switches

df_mortality_close_clusters <- df_all_genetic_pairs_pheno_switches %>%
  arrange(iso2) %>%
  filter(switches %in% c("Survived-Died")) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  select(iso1, iso2, switches, iso_cluster) 



df_mortality_close_clusters %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 28
df_mortality_close_clusters %>%
  .$iso1 %>%
  n_distinct()
## [1] 76
df_mortality_close_clusters %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

Alternative way to generate a dataset

df_mortality_close_pairs <- df_all_genetic_pairs_pheno_switches %>%
  arrange(iso2) %>%
  filter(switches %in% c("Survived-Died")) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  select(iso1, iso2, switches, iso_cluster) %>%
  arrange(iso2) %>%
  mutate(same_iso2 = if_else(row_number() == 1, FALSE, lag(iso2) == iso2)) %>%
  filter(!same_iso2) %>%
  select(iso1, iso2, switches, iso_cluster) %>%
  mutate(dataset = "pairs")

df_mortality_close_pairs %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 28
iso1_pairs <- sort(df_mortality_close_pairs$iso1)


iso1_reserve <- df_all_genetic_pairs_pheno_switches %>%
  arrange(iso2) %>%
  filter(switches %in% c("Survived-Died")) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  select(iso1, iso2, switches, iso_cluster) %>%
  filter(!iso1 %in% iso1_pairs) %>%
  .$iso1 %>%
  unique() %>%
  sort()

df_mortality_close_clusters_reserve <- df_all_genetic_pairs_pheno_switches %>%
  arrange(iso2) %>%
  filter(switches %in% c("Survived-Died")) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
  select(iso1, iso2, switches, iso_cluster) %>%
  filter(iso1 %in% iso1_reserve) %>%
  ungroup() %>%
  arrange(iso1) %>%
  distinct(iso1, .keep_all = T) %>%
  mutate(dataset = "reserve")

df_mortality_close_clusters_2 <- bind_rows(df_mortality_close_pairs, 
                                 df_mortality_close_clusters_reserve)



df_mortality_close_clusters_2 %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 28
df_mortality_close_clusters_2 %>%
  .$iso1 %>%
  n_distinct()
## [1] 76
df_mortality_close_clusters_2 %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

Convergence analysis

df_mutations_phenotypes_mortality_close_clusters <- snippy_data_modified_proteins %>%
  right_join(df_mortality_close_clusters) 
## Joining, by = c("iso1", "iso2")
df_convergence_mortality_clusters <- df_mutations_phenotypes_mortality_close_clusters %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n_clusters = n_distinct(iso_cluster), 
          n_pairs = n_distinct(PAIR_ID),
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, switches, everything()) %>%
  arrange(clstr_prot, desc(n_clusters))

t_mortality_clusters <- df_convergence_mortality_clusters %>%
  group_by(clstr_prot) %>%
  mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
  distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|")))%>%
  mutate(phenotype = "SAB mortality") %>%
  arrange(desc(n_clusters))
t_mortality_clusters
## # A tibble: 689 x 13
## # Groups:   n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot,
## #   nebraska_locus_tag, neb_mutant_id, neb_gene [689]
##    n_clusters n_pairs clstr_prot clstr_size_prot SEQUENCE_prot nebraska_locus_…
##         <int>   <int>      <dbl>           <dbl> <chr>         <chr>           
##  1         13      25          7              73 MIMGNLRFQQEY… SAUSA300_0181   
##  2         13      31        358              30 MLFHKKIESLIS… SAUSA300_2167   
##  3         11      11        469              10 MKFSTLSEEEFT… SAUSA300_1141   
##  4         11      13       1105              49 MSNQNYDYNKNE… SAUSA300_0754   
##  5         11      17        569              20 MKIIHTADWHLG… SAUSA300_1242   
##  6         11      22        211              25 MAKLLIMSIVSF… <NA>            
##  7         11      54        140              56 MKFKSLITTTLA… <NA>            
##  8         11      55          0              61 MNYRDKIQKFSI… SAUSA300_1327   
##  9         10      17         65              27 MNMKKKEKHAIR… <NA>            
## 10         10      26        435              66 MELLNRYNFVLF… SAUSA300_1991   
## # … with 679 more rows, and 7 more variables: neb_mutant_id <chr>,
## #   neb_gene <chr>, neb_product <chr>, GENE <chr>, PRODUCT <chr>,
## #   MUTATION_SHORT <chr>, phenotype <chr>

Map on the phylogenetic tree

Convergence analysis of growth rate

Explore growth rate

df_phenotypes
## # A tibble: 234 x 111
##    iso2  iso1   dist iso1_ST iso1_strain_gro… iso1_mecA iso1_mortality
##    <chr> <chr> <dbl> <chr>   <chr>                <dbl> <chr>         
##  1 BPH2… BPH2…    21 93      VAN-004                  1 Survived      
##  2 BPH2… BPH2…    21 93      VAN-004                  1 Survived      
##  3 BPH2… BPH2…    20 97      VAN-007                  0 Survived      
##  4 BPH2… BPH2…    20 97      VAN-007                  0 Survived      
##  5 BPH2… BPH2…    18 239     VAN-064                  1 Survived      
##  6 BPH2… BPH3…    16 239     VSS-287                  1 Died          
##  7 BPH2… BPH2…    20 239     VAN-155                  1 Died          
##  8 BPH2… BPH2…    29 45      VAN-011                  0 Survived      
##  9 BPH2… BPH2…    29 45      VAN-011                  0 Survived      
## 10 BPH2… BPH2…    18 239     VAN-082                  1 Survived      
## # … with 224 more rows, and 104 more variables: iso1_intrahost_index <dbl>,
## #   iso1_included <lgl>, iso1_unrelated_to_index <dbl>, iso1_mut_count <dbl>,
## #   iso1_scv <lgl>, iso1_vanco_difference <dbl>, iso1_persister_type <chr>,
## #   iso1_intrahost_sampledelay <dbl>, iso1_sample_type <chr>, iso1_genes <chr>,
## #   iso1_CC <chr>, iso2_ST <chr>, iso2_strain_group <chr>, iso2_mecA <dbl>,
## #   iso2_mortality <chr>, iso2_intrahost_index <dbl>, iso2_included <lgl>,
## #   iso2_unrelated_to_index <dbl>, iso2_mut_count <dbl>, iso2_scv <lgl>,
## #   iso2_vanco_difference <dbl>, iso2_persister_type <chr>,
## #   iso2_intrahost_sampledelay <dbl>, iso2_sample_type <chr>, iso2_genes <chr>,
## #   iso2_CC <chr>, CC <chr>, iso1_time_of_max_rate_OD <dbl>,
## #   iso1_max_rate_OD <dbl>, iso1_doubling_time_OD <dbl>, iso1_AUC_OD <dbl>,
## #   iso1_time_of_max_OD <dbl>, iso1_max_OD <dbl>, iso1_time_of_min_OD <dbl>,
## #   iso1_min_OD <dbl>, iso1_end_point_OD <dbl>,
## #   iso1_end_point_timepoint_OD <dbl>, iso1_AUC_death <dbl>,
## #   iso1_time_of_max_death <dbl>, iso1_max_death <dbl>,
## #   iso1_time_of_min_death <dbl>, iso1_min_death <dbl>,
## #   iso1_time_of_max_rate_death <dbl>, iso1_max_rate_death <dbl>,
## #   iso1_doubling_time_death <dbl>, iso1_end_point_death <dbl>,
## #   iso1_end_point_timepoint_death <dbl>, iso2_time_of_max_rate_OD <dbl>,
## #   iso2_max_rate_OD <dbl>, iso2_doubling_time_OD <dbl>, iso2_AUC_OD <dbl>,
## #   iso2_time_of_max_OD <dbl>, iso2_max_OD <dbl>, iso2_time_of_min_OD <dbl>,
## #   iso2_min_OD <dbl>, iso2_end_point_OD <dbl>,
## #   iso2_end_point_timepoint_OD <dbl>, iso2_AUC_death <dbl>,
## #   iso2_time_of_max_death <dbl>, iso2_max_death <dbl>,
## #   iso2_time_of_min_death <dbl>, iso2_min_death <dbl>,
## #   iso2_time_of_max_rate_death <dbl>, iso2_max_rate_death <dbl>,
## #   iso2_doubling_time_death <dbl>, iso2_end_point_death <dbl>,
## #   iso2_end_point_timepoint_death <dbl>, delta_time_of_max_rate_OD <dbl>,
## #   delta_max_rate_OD <dbl>, delta_doubling_time_OD <dbl>, delta_AUC_OD <dbl>,
## #   delta_time_of_max_OD <dbl>, delta_time_of_min_OD <dbl>, delta_max_OD <dbl>,
## #   delta_min_OD <dbl>, delta_end_point_OD <dbl>,
## #   delta_time_of_max_rate_death <dbl>, delta_max_rate_death <dbl>,
## #   delta_doubling_time_death <dbl>, delta_AUC_death <dbl>,
## #   delta_time_of_max_death <dbl>, delta_time_of_min_death <dbl>,
## #   delta_max_death <dbl>, delta_min_death <dbl>, delta_end_point_death <dbl>,
## #   log2fc_time_of_max_rate_OD <dbl>, log2fc_max_rate_OD <dbl>,
## #   log2fc_doubling_time_OD <dbl>, log2fc_AUC_OD <dbl>,
## #   log2fc_time_of_max_OD <dbl>, log2fc_time_of_min_OD <dbl>,
## #   log2fc_max_OD <dbl>, log2fc_min_OD <dbl>, log2fc_end_point_OD <dbl>,
## #   log2fc_time_of_max_rate_death <dbl>, log2fc_max_rate_death <dbl>,
## #   log2fc_doubling_time_death <dbl>, log2fc_AUC_death <dbl>,
## #   log2fc_time_of_max_death <dbl>, log2fc_time_of_min_death <dbl>, …
for (var in str_subset(colnames(df_phenotypes), "iso1_.*_OD")){
  p <- ggdensity(data = df_phenotypes, x = var, fill = "red") +
    theme_bw()
  
  print(p)
}

df_phenotypes %>%
  ggplot(aes(x = abs(delta_AUC_OD))) +
  geom_density(fill = "red") +
  theme_bw()

df_plot <- df_phenotypes %>%
  mutate_at(vars(matches("delta_.*_OD")),
            abs)

for (var in str_subset(colnames(df_plot), "delta_.*_OD")){
  p <- ggdensity(data = df_plot, x = var, fill = "red") +
    theme_bw()
  
  print(p)
}

Generate dataset of clusters of discordant growth

Based on the exploration above, we define a delta AUC threshold of 30 for discordant pairs (60 would be a more stringent one)

# df_growth_close_clusters <- df_phenotypes %>%
#   arrange(iso2) %>%
#   filter(delta_AUC_OD < - 10) %>%
#   group_by(iso2) %>%
#   mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
#   ungroup() %>%
#   distinct(iso1, .keep_all =T)  %>%
#   select(iso1, iso2, iso_cluster, delta_AUC_OD, iso1_AUC_OD, iso2_AUC_OD)

df_growth_close_clusters <- df_phenotypes %>%
arrange(iso2) %>%
  filter(delta_AUC_OD < -10) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster"), 
         n = n()) %>%
  select(iso1, iso2, iso_cluster, n, delta_AUC_OD, iso1_AUC_OD, iso2_AUC_OD) %>%
  arrange(iso1, n) %>%
  group_by(iso1) %>%
  mutate(keep = row_number() == 1) %>%
  filter(keep)

df_growth_close_clusters %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 22
df_growth_close_clusters %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

Convergence analysis of growth

df_mutations_growth_close_clusters <- snippy_data_modified_proteins %>%
  right_join(df_growth_close_clusters) 
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_growth_close_clusters)
## [1] 1100
df_convergence_growth_clusters <- df_mutations_growth_close_clusters %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n_clusters = n_distinct(iso_cluster), 
          n_pairs = n_distinct(PAIR_ID),
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_OD, everything()) %>%
  arrange(clstr_prot, desc(n_clusters))

t_growth_clusters <- df_convergence_growth_clusters %>%
  group_by(clstr_prot) %>%
  mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
  distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|"))) %>%
  mutate(phenotype = "Growth rate (AUC)") %>%
  arrange(desc(n_clusters))

Convergence analysis of cytotoxicity

Explore cytotoxicity

for (var in str_subset(colnames(df_phenotypes), "iso1_.*_death")){
  p <- ggdensity(data = df_phenotypes, x = var, fill = "blue") +
    theme_bw()
  
  print(p)
}

df_plot <- df_phenotypes %>%
  mutate_at(vars(matches("delta_.*_death")),
            abs)

for (var in str_subset(colnames(df_plot), "delta_.*_death")){
  p <- ggdensity(data = df_plot, x = var, fill = "blue") +
    theme_bw()
  
  print(p)
}

Generate dataset of clusters of discordant cell death

# df_cell_death_close_clusters <- df_phenotypes %>%
#   arrange(iso2) %>%
#   filter(delta_AUC_death < - 40) %>%
#   group_by(iso2) %>%
#   mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
#   ungroup() %>%
#   distinct(iso1, .keep_all =T)  %>%
#   select(iso1, iso2, iso_cluster, delta_AUC_death, iso1_AUC_death, iso2_AUC_death)

df_cell_death_close_clusters <- df_phenotypes %>%
  arrange(iso2) %>%
  filter(delta_AUC_death < - 40) %>%
  group_by(iso2) %>%
  mutate(iso_cluster = str_c(iso2, "-cluster"), 
         n = n()) %>%
  select(iso1, iso2, iso_cluster, n, delta_AUC_death, iso1_AUC_death, iso2_AUC_death) %>%
  arrange(iso1, n) %>%
  group_by(iso1) %>%
  mutate(keep = row_number() == 1) %>%
  filter(keep)


df_cell_death_close_clusters %>%
  .$iso_cluster %>%
  n_distinct()
## [1] 13
df_cell_death_close_clusters %>%
  ggplot(aes(x = fct_infreq(iso_cluster))) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

Convergence analysis of cytotoxicity

df_mutations_cell_death_close_clusters <- snippy_data_modified_proteins %>%
  right_join(df_cell_death_close_clusters) 
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_cell_death_close_clusters)
## [1] 857
df_convergence_cell_death_clusters <- df_mutations_cell_death_close_clusters %>%
  drop_na(PRODUCT) %>%
  filter(EFFTYPE != "synonymous_variant") %>%
  group_by(clstr_prot, clstr_size_prot) %>%
   mutate(n_clusters = n_distinct(iso_cluster), 
          n_pairs = n_distinct(PAIR_ID),
         n_references = n_distinct(REFERENCE)) %>%
  select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_death, everything()) %>%
  arrange(clstr_prot, desc(n_clusters))

t_cell_death_clusters <- df_convergence_cell_death_clusters %>%
  group_by(clstr_prot) %>%
  mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
  distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|"))) %>%
  mutate(phenotype = "Cell death (AUC)")

Merge datasets

t <- bind_rows(t_mortality_clusters,
               t_growth_clusters,
               t_cell_death_clusters)

t_convergent <- t %>%
  ungroup() %>%
  select(clstr_prot, starts_with("neb"), n_clusters, phenotype) %>%
  pivot_wider(names_from = phenotype, names_prefix = "n_clusters_", values_from = n_clusters, values_fill = list(n_clusters = 0)) %>%
  left_join(t) %>%
   group_by_at(vars(clstr_prot, starts_with("n_clusters_"),
                    starts_with("neb"))) %>%
  summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
               funs(str_c(unique(.), collapse = "|")))
## Joining, by = c("clstr_prot", "nebraska_locus_tag", "neb_mutant_id", "neb_gene", "neb_product")

Summarise data (for Venn diagram): all mutated genes

t_venn <- t_convergent %>%
  ungroup() %>%
  mutate_at(vars(starts_with("n_clusters")),
            funs(mutated = as.integer(. > 0))) %>%
  unite(col = "intersect", ends_with("mutated"), sep = "") %>%
  mutate(intersect = recode(intersect,
                            `111` = "all",
                            `100` = "SAB mortality only",
                            `110` = "SAB mortality and growth rate",
                            `101` = "SAB mortality and cell death",
                            `011` = "Growth rate and cell death",
                            `010` = "Growth rate only",
                            `011` = "Growth rate and cell death",
                            `001` = "Cell death only"))

t_venn_synthesis <- t_venn %>%
  count(intersect)

Summarise data (for Venn diagram): genes mutated at least 2 times (convergent)

t_venn_stringent <- t_convergent %>%
  ungroup() %>%
  mutate_at(vars(starts_with("n_clusters")),
            funs(mutated = as.integer(. > 1))) %>%
  unite(col = "intersect", ends_with("mutated"), sep = "") %>%
  filter(intersect != "000") %>%
  mutate(intersect = recode(intersect,
                            `111` = "all",
                            `100` = "SAB mortality only",
                            `110` = "SAB mortality and growth rate",
                            `101` = "SAB mortality and cell death",
                            `011` = "Growth rate and cell death",
                            `010` = "Growth rate only",
                            `011` = "Growth rate and cell death",
                            `001` = "Cell death only"))

t_venn_stringent_synthesis <- t_venn_stringent %>%
  count(intersect)

Plot pairs on the tree

Mortality tree

List of paired isolates included in the paired analysis

df_mortality_close_clusters_long <- df_mortality_close_clusters %>%
  select(-switches) %>%
  ungroup() %>%
  pivot_longer(iso1:iso2, names_to = "iso_rank", values_to = "ISOLATE") %>%
  distinct() %>%
  transmute(ISOLATE, iso_cluster, mortality = if_else(iso_rank == "iso2", "Died", "Survived"))

# Add some metadata
df_genetic_pairs_unique <- df_all_genetic_pairs_pheno %>%
  select(starts_with("iso1")) %>%
  distinct() %>%
  rename(ISOLATE = iso1) %>%
  rename_at(vars(starts_with("iso1")),
            funs(str_remove(., "iso1_")))
df_mortality_close_clusters_long_with_metadata <- df_mortality_close_clusters_long %>%
  left_join(df_genetic_pairs_unique)
## Joining, by = c("ISOLATE", "mortality")
# List of unique isolates
mortality_close_clusters <- sort(unique(df_mortality_close_clusters_long$ISOLATE))
write_lines(mortality_close_clusters, "Ideas_Grant_2020_analysis/Genetic_pairs_table/mortality_close_clusters.txt")

Import tree

mortality_tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/mortality_close_clusters.tree") %>%
  phytools::midpoint.root()
p <- ggtree(mortality_tree, layout = "circular")

Annotate tree with mortality and convergent genes

metadata_matrix <- df_mortality_close_clusters_long_with_metadata %>%
  select(ISOLATE, CC, mortality) %>%
  distinct() %>%
  column_to_rownames("ISOLATE")
p2 <- gheatmap(p, metadata_matrix %>% select(mortality)) +
  scale_fill_manual(values = c("orange", "navy"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2

library(ggnewscale)
p3 <- p2 + new_scale_fill()

p4 <- gheatmap(p3, metadata_matrix %>%
           select(CC), offset = .05) +
  scale_fill_viridis_d()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
top_genes <- t_mortality_clusters %>% arrange(desc(n_clusters)) %>% ungroup() %>% filter(row_number() <= 10)
df_top_genes <- df_convergence_mortality_clusters %>%
  ungroup() %>%
  filter(clstr_prot %in% top_genes$clstr_prot) %>%
  distinct(n_clusters, ISOLATE, clstr_prot, GENE, PRODUCT, neb_mutant_id, neb_gene, neb_product, MUTATION_SHORT)
top_genes_matrix <- df_top_genes %>%
  distinct(ISOLATE, clstr_prot) %>%
  mutate(mutated = T) %>%
  pivot_wider(names_from = clstr_prot, values_from = mutated, values_fill = list(mutated = F)) %>%
  column_to_rownames("ISOLATE")

p5 <- p4 + new_scale_fill() 

p6 <- gheatmap(p5, top_genes_matrix, offset = .2) +
  scale_fill_manual(values = c("white", "red"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p6

Growth rate tree

List of paired isolates included in the paired analysis

df_growth_close_clusters_long <- df_growth_close_clusters %>%
  ungroup() %>%
  select(iso1, iso2, iso_cluster) %>%
  pivot_longer(iso1:iso2, names_to = "iso_rank", values_to = "ISOLATE") %>%
  distinct(ISOLATE, iso_cluster) 

# Add some metadata
df_genetic_pairs_unique <- df_all_genetic_pairs_pheno %>%
  select(starts_with("iso1")) %>%
  distinct() %>%
  rename(ISOLATE = iso1) %>%
  rename_at(vars(starts_with("iso1")),
            funs(str_remove(., "iso1_")))
df_growth_close_clusters_long_with_metadata <- df_growth_close_clusters_long %>%
  left_join(df_genetic_pairs_unique)
## Joining, by = "ISOLATE"
# List of unique isolates
growth_close_clusters <- sort(unique(df_growth_close_clusters_long$ISOLATE))
write_lines(growth_close_clusters, "Ideas_Grant_2020_analysis/Genetic_pairs_table/growth_close_clusters.txt")

Import tree

growth_tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/growth_close_clusters.tree") %>%
  phytools::midpoint.root()
p <- ggtree(growth_tree, layout = "circular")

Annotate tree with mortality and convergent genes

metadata_matrix <- df_growth_close_clusters_long_with_metadata %>%
  select(ISOLATE, CC, AUC_OD) %>%
  distinct() %>%
  column_to_rownames("ISOLATE")
p2 <- gheatmap(p, metadata_matrix %>% select(AUC_OD)) +
  scale_fill_viridis_c()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2

library(ggnewscale)
# p3 <- p2 + new_scale_fill()

# p4 <- gheatmap(p3, metadata_matrix %>%
#            select(CC), offset = .2) +
#   scale_fill_viridis_d()
# p4

top_genes <- t_growth_clusters %>% arrange(desc(n_clusters)) %>% ungroup() %>% filter(row_number() <= 10)
df_top_genes <- df_convergence_growth_clusters %>%
  ungroup() %>%
  filter(clstr_prot %in% top_genes$clstr_prot) %>%
  distinct(n_clusters, ISOLATE, clstr_prot, GENE, PRODUCT, neb_mutant_id, neb_gene, neb_product, MUTATION_SHORT)
top_genes_matrix <- df_top_genes %>%
  distinct(ISOLATE, clstr_prot) %>%
  mutate(mutated = T) %>%
  pivot_wider(names_from = clstr_prot, values_from = mutated, values_fill = list(mutated = F)) %>%
  column_to_rownames("ISOLATE")

p5 <- p2 + new_scale_fill() 

p6 <- gheatmap(p5, top_genes_matrix, offset = .2) +
  scale_fill_manual(values = c("white", "red"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p6

Cell death tree

List of paired isolates included in the paired analysis

df_cell_death_close_clusters_long <- df_cell_death_close_clusters %>%
  ungroup() %>%
  select(iso1, iso2, iso_cluster) %>%
  pivot_longer(iso1:iso2, names_to = "iso_rank", values_to = "ISOLATE") %>%
  distinct(ISOLATE, iso_cluster) 

# Add some metadata
df_cell_death_close_clusters_long_with_metadata <- df_cell_death_close_clusters_long %>%
  left_join(df_genetic_pairs_unique)
## Joining, by = "ISOLATE"
# List of unique isolates
cell_death_close_clusters <- sort(unique(df_growth_close_clusters_long$ISOLATE))
write_lines(cell_death_close_clusters, "Ideas_Grant_2020_analysis/Genetic_pairs_table/cell_death_close_clusters.txt")

Import tree

growth_tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/cell_death_close_clusters.tree") %>%
  phytools::midpoint.root()
p <- ggtree(growth_tree, layout = "circular")

Annotate tree with mortality and convergent genes

metadata_matrix <- df_cell_death_close_clusters_long_with_metadata %>%
  select(ISOLATE, CC, AUC_death) %>%
  distinct() %>%
  column_to_rownames("ISOLATE")
p2 <- gheatmap(p, metadata_matrix %>% select(AUC_death)) +
  scale_fill_viridis_c()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2

library(ggnewscale)
# p3 <- p2 + new_scale_fill()

# p4 <- gheatmap(p3, metadata_matrix %>%
#            select(CC), offset = .2) +
#   scale_fill_viridis_d()
# p4

top_genes <- t_cell_death_clusters %>% arrange(desc(n_clusters)) %>% ungroup() %>% filter(row_number() <= 10)
df_top_genes <- df_convergence_cell_death_clusters %>%
  ungroup() %>%
  filter(clstr_prot %in% top_genes$clstr_prot) %>%
  distinct(n_clusters, ISOLATE, clstr_prot, GENE, PRODUCT, neb_mutant_id, neb_gene, neb_product, MUTATION_SHORT)
top_genes_matrix <- df_top_genes %>%
  distinct(ISOLATE, clstr_prot) %>%
  mutate(mutated = T) %>%
  pivot_wider(names_from = clstr_prot, values_from = mutated, values_fill = list(mutated = F)) %>%
  column_to_rownames("ISOLATE")

p5 <- p2 + new_scale_fill() 

p6 <- gheatmap(p5, top_genes_matrix, offset = .2) +
  scale_fill_manual(values = c("white", "red"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p6